### This script takes in raw import/export data from the Chinese Customs Data dataset
### and produces a simplified measure of the share of imports tied to export-processing
### trade by industry and year. Note: the script draws on proprietary data not included
### with the replication files. For more information, see the README.txt included with
### the materials and replication_log.txt

# Remove every visible object from the current workspace so that nothing left
# over from an earlier script in the same session can be picked up below.
# NOTE(review): this does not detach packages or reset options.
rm(list = ls())
# Every file path used later in this script is relative, so it is resolved
# against this directory. Change it to the folder that holds the project on
# the local machine before running.
setwd("/Users/John/Dropbox/") # Set custom working directory

# --- Load Required Packages ---
library(dplyr)
library(readr)
library(stringr)
library(vroom)

# --- Load correspondence table ---
# The two code columns are identifiers, not quantities, so they are read as
# character explicitly instead of relying on type guessing. Both are then
# left-padded with zeros to four characters so that they compare equal to the
# four-character `hs4` key built from the customs data.
industrycode <- read_csv(
  "JOP_Replication_Materials/data/raw/isic_hs4_correspondence.csv",
  col_types = readr::cols(
    .default = readr::col_guess(),
    isic = readr::col_character(),
    hs4 = readr::col_character()
  )
) %>%
  mutate(
    isic = str_pad(isic, 4, pad = "0"),
    hs4 = str_pad(hs4, 4, pad = "0")
  )

# --- Function to take in import-export data and create processing share measure ---
# Computes, for one year, the share of value by ISIC industry that belongs to
# trade types "14"/"15" (export-processing trade).
#
# Arguments:
#   year      Year to process; the input file is `<data_path>/<year>.csv`.
#   data_path Directory holding one CSV per year.
#   codebook  Correspondence table with character columns `hs4` and `isic`.
#
# Returns a tibble with one row per matched ISIC code and the columns
#   year, isic, processing_share
# where processing_share is a percentage rounded to 3 decimals. It is NaN for
# an industry whose total value is 0 (0 / 0).
#
# Because the join is many-to-many, a record whose HS4 heading maps to several
# ISIC codes contributes its full value to each of them.
process_year <- function(year, data_path, codebook) {
  message("🔄 Processing year: ", year)

  file_path <- file.path(data_path, paste0(year, ".csv"))

  # `value` is read as text and validated by hand further down; parser
  # warnings raised for the remaining guessed columns are suppressed.
  raw <- suppressWarnings(
    vroom::vroom(
      file_path,
      na = c(".", "", "Z"),
      col_types = cols(
        .default = col_guess(),
        value = col_character()
      ),
      show_col_types = FALSE,
      altrep = FALSE
    )
  )

  imports <- raw %>%
    # exp_imp == 1 selects the import records
    # NOTE(review): coding taken from the script header; confirm against the
    # data codebook.
    filter(exp_imp == 1) %>%
    # Keep only plain non-negative decimals (no sign, exponent or blanks)
    filter(grepl("^\\d+(\\.\\d+)?$", value)) %>%
    mutate(
      value = as.numeric(value),
      hs_chr = as.character(hs),
      # Codes of up to 6 characters are treated as HS6 and padded to 6;
      # longer codes are treated as HS8 and padded to 8. Testing for exactly
      # 6 characters would pad a 5-character HS6 code (leading zero lost) to
      # 8 and yield an hs4 beginning with "000" that can never match.
      hs = ifelse(nchar(hs_chr) <= 6,
                  str_pad(hs_chr, 6, pad = "0"),
                  str_pad(hs_chr, 8, pad = "0")),
      hs4 = substr(hs, 1, 4),
      # Flag (not subset) records tied to export-processing trade
      processtrade = ifelse(tradetype %in% c("14", "15"), 1, 0)
    ) %>%
    dplyr::select(-hs_chr)

  # Attach ISIC codes and drop records without a match
  matched <- imports %>%
    left_join(codebook, by = "hs4", relationship = "many-to-many") %>%
    filter(!is.na(isic))

  # One grouped pass yields both sums; an industry without any processing
  # record gets sum(numeric(0)) == 0, so no NA back-fill is needed.
  matched %>%
    group_by(isic) %>%
    summarise(
      total_value = sum(value, na.rm = TRUE),
      process_value = sum(value[processtrade == 1], na.rm = TRUE),
      .groups = "drop"
    ) %>%
    mutate(
      processing_share = round((process_value / total_value) * 100, 3),
      # No `year` column exists at this point, so this is the function argument
      year = year
    ) %>%
    dplyr::select(year, isic, processing_share)
}

# --- Loop over years ---
years <- 1998:2013
data_dir <- "JOP Submission/JOP Revision 2/Final_Dataset_JOP/ccd_full"
output_path <- "JOP_Replication_Materials/data/processed/ccd_isic_processing.csv"

# Create the destination folder up front (no-op if it already exists) so a
# missing folder cannot make the final write fail after every year has
# already been processed.
dir.create(dirname(output_path), recursive = TRUE, showWarnings = FALSE)

# --- Store yearly outputs ---
# One tibble per year, in the order of `years`
yearly_results <- lapply(years, process_year, data_path = data_dir, codebook = industrycode)

# --- Bind all years ---
# process_year() already returns exactly year, isic and processing_share, so
# the stacked table only needs sorting.
final <- bind_rows(yearly_results) %>%
  arrange(isic, year)

# --- Write to file ---
write_csv(final, output_path)

# --- Print Table for Log File ---

# Load raw 2005 CCD data (example) from the same folder the yearly loop reads
ccd2005 <- read_csv(file.path(data_dir, "2005.csv"))

# Explicit print() calls: bare expressions are only auto-printed at top level,
# so under source() without echo the summary and row count would not reach
# the log.
print(summary(ccd2005))
print(nrow(ccd2005))

# Print head of 2005 CCD data (example)
print(head(ccd2005, 10))

# Print head of final data (example)
print(head(final, 10))
